Practice Lesson 2: Inductive Analytics

Packages

## load required libraries
library(tidyverse)
library(quanteda)
library(lexicon)
library(reshape2)
library(stringi)
library(quanteda.textplots)
library(quanteda.textmodels)
library(quanteda.textstats)
library(gridExtra)
library(seededlda)
library(ggrepel)
library(ggdendro)
library(factoextra)
library(lattice)
library(spacyr)

Clean workspace and set working directory

## clean workspace (removes all objects from the global environment;
## attached packages and options are NOT reset)
rm(list=ls())
## set working directory (WD)
## NOTE(review): this hard-codes a user-specific path -- adjust to your setup
path <- '~/coliphi21/practice_lessons/lesson_2/src/'
## you can also set it dynamically: 
## setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
setwd(path)
## check that WD is set correctly
getwd()
## [1] "/Users/lucienbaumgartner/coliphi21/practice_lessons/lesson_2/src"

Import data

For this tutorial you can either work with your own data, or the pre-built corpora provided in the /input-folder for the first practice session. The quanteda-package also contains pre-built corpora you can use. For this session, we scraped the Stanford Encyclopedia of Philosophy and built a corpus including additional metadata.

## relative path (relative to the WD set above)
load('../input/stanford-encyclopedia.RDS')
## absolute path (equivalent -- run one of the two, not both)
## NOTE(review): load() expects files written with save(); despite the .RDS
## extension this file apparently restores an object named `sfe` (see the
## disclaimer below), so it was presumably written with save() -- if it were
## written with saveRDS(), readRDS() would be required instead. Confirm.
load('~/coliphi21/practice_lessons/lesson_2/input/stanford-encyclopedia.RDS')

If you work with your own corpus

## NOTE: readtext() comes from the readtext package, which is not loaded
## in the library block above -- load it before running this chunk
library(readtext)
## either: let readtext() infer the text and document-id fields
sfe <- readtext('path/to/your/data')
## or: name the text and document-id fields explicitly
sfe <- readtext('path/to/your/data', text_field = 'name_of_text_var', docid_field = 'id_var')
## then convert the readtext object into a quanteda corpus
sfe <- corpus(sfe)

Disclaimer

Loading the data above will import a pre-built corpus object into R, which is called sfe.

Inspect data

## what does the corpus object look like?
sfe
## Corpus consisting of 1,712 documents and 21 docvars.
## 18thGerman-preKant.json :
## " In Germany, the eighteenth century was the age of enlighten..."
## 
## abduction.json :
## " In the philosophical literature, the term abduction is used..."
## 
## abelard.json :
## " Peter Abelard (1079–21 April 1142) [Abailard or Abaelard or..."
## 
## abhidharma.json :
## " The first centuries after Śākyamuni Buddha death saw the ri..."
## 
## abilities.json :
## " In the accounts we give of one another, claims about our ab..."
## 
## abner-burgos.json :
## " Abner of Burgos (Alfonso de Valladolid; c. 1260–1347) was p..."
## 
## [ reached max_ndoc ... 1,706 more documents ]
## summary statistics (head limits the printout to the first six documents)
summary(sfe) %>% head
## available document-level variables (metadata attached to the corpus)
docvars(sfe)

> Exercise

Familiarize yourself a little more with the data.

Prep

## tokenization: split documents into word tokens, dropping punctuation,
## symbols, numbers, and URLs (use TRUE/FALSE, not the reassignable T/F)
toks <- tokens(sfe, what = 'word',
               remove_punct = TRUE, remove_symbols = TRUE, padding = FALSE, 
               remove_numbers = TRUE, remove_url = TRUE)
?tokens
## to lower
toks <- tokens_tolower(toks)
## lemmatizing: map each token to its lemma via the lexicon hash table
toks <- tokens_replace(toks, 
                       pattern = lexicon::hash_lemmas$token, 
                       replacement = lexicon::hash_lemmas$lemma)
## remove stopwords
toks <- tokens_select(toks,  pattern = stopwords("en"), selection = "remove")
## remove noise: tokens containing digits, and single-character tokens
toks <- tokens_select(toks, pattern = '[0-9]+|^.$', valuetype = 'regex', selection = 'remove')
## create dfm, trimming rare terms (below the 0.8 term-frequency quantile)
## and overly common terms (in more than 10% of documents)
dfm_sfe <- dfm(toks) %>% 
           dfm_trim(min_termfreq = 0.8, termfreq_type = "quantile",
                    max_docfreq = 0.1, docfreq_type = "prop")
dfm_sfe
## Document-feature matrix of: 1,712 documents, 24,689 features (98.47% sparse) and 21 docvars.
##                          features
## docs                      ethos immanuel thomasius pietist thomasians wolff well dis halle pietism
##   18thGerman-preKant.json     2        1        33       6         11    36    0   1    19       7
##   abduction.json              0        0         0       0          0     0    0   0     0       0
##   abelard.json                0        0         0       0          0     0    0   0     0       0
##   abhidharma.json             0        0         0       0          0     0    0   0     0       0
##   abilities.json              0        0         0       0          0     0    0   0     0       0
##   abner-burgos.json           0        0         0       0          0     0    0   0     0       0
## [ reached max_ndoc ... 1,706 more documents, reached max_nfeat ... 24,679 more features ]

> Exercise

Task

Check whether there is still some noise in the data and remove it. Hint: Scan through the topfeatures.

Solution
topfeatures(dfm_sfe, n=200)
## remove additional words
## FIX: the original pattern 'many|much' was unanchored, so it also removed
## every token merely CONTAINING those substrings (e.g. "germany");
## anchor the regex so only the whole words "many" and "much" are dropped
toks <- tokens_select(toks, pattern = '^(many|much)$', valuetype = 'regex', selection = 'remove')
## re-create dfm with the same trimming as before
dfm_sfe <- dfm(toks) %>% 
           dfm_trim(min_termfreq = 0.8, termfreq_type = "quantile",
                    max_docfreq = 0.1, docfreq_type = "prop")

Scaling: correspondence analysis

## compute the correspondence analysis model on the trimmed dfm
sfe_ca <- textmodel_ca(dfm_sfe)
## inspect the model structure (row = document, col = feature coordinates)
str(sfe_ca)
## List of 15
##  $ sv        : num [1:22] 0.763 0.737 0.715 0.701 0.696 ...
##  $ nd        : num 22
##  $ rownames  : chr [1:1712] "18thGerman-preKant.json" "abduction.json" "abelard.json" "abhidharma.json" ...
##  $ rowmass   : num [1:1712] 0.000389 0.000374 0.000536 0.000755 0.000404 ...
##  $ rowdist   : num [1:1712] 20.2 19.5 14.5 17.2 14.6 ...
##  $ rowinertia: num [1:1712] 0.1585 0.1417 0.1129 0.2247 0.0867 ...
##  $ rowcoord  : num [1:1712, 1:22] 0.56 -0.784 0.48 1.247 -0.201 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:1712] "18thGerman-preKant.json" "abduction.json" "abelard.json" "abhidharma.json" ...
##   .. ..$ : chr [1:22] "Dim1" "Dim2" "Dim3" "Dim4" ...
##  $ rowsup    : logi(0) 
##  $ colnames  : chr [1:24689] "ethos" "immanuel" "thomasius" "pietist" ...
##  $ colmass   : num [1:24689] 5.67e-05 1.05e-04 3.11e-05 2.93e-05 6.71e-06 ...
##  $ coldist   : num [1:24689] 7.7 4.52 33.32 13.48 50.69 ...
##  $ colinertia: num [1:24689] 0.00336 0.00215 0.03453 0.00532 0.01723 ...
##  $ colcoord  : num [1:24689, 1:22] 0.522 0.331 0.763 0.822 0.734 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:24689] "ethos" "immanuel" "thomasius" "pietist" ...
##   .. ..$ : chr [1:22] "Dim1" "Dim2" "Dim3" "Dim4" ...
##  $ colsup    : logi(0) 
##  $ call      : language textmodel_ca.dfm(x = dfm_sfe)
##  - attr(*, "class")= chr [1:3] "textmodel_ca" "ca" "list"
## coerce the first two document dimensions into a dataframe
## (note: this overwrites the model object `sfe_ca` with the dataframe)
sfe_ca <- data.frame(dim1 = coef(sfe_ca, doc_dim = 1)$coef_document, 
                     dim2 = coef(sfe_ca, doc_dim = 2)$coef_document)
str(sfe_ca)
## 'data.frame':    1712 obs. of  2 variables:
##  $ dim1: num  0.56 -0.784 0.48 1.247 -0.201 ...
##  $ dim2: num  0.125 -0.274 0.889 0.617 -0.101 ...
## strip the '.json' suffix (and anything after it) to build a clean id
sfe_ca$id <- gsub('\\.json.*', '', rownames(sfe_ca))
head(sfe_ca)
## plot full data with branch annotation
ggplot(sfe_ca, aes(x=dim1, y=dim2, label=id)) +
  # color encodes the difference between the two CA dimensions
  geom_point(aes(color=dim1-dim2), alpha = 0.2) +
  # plot 0.2 of all labels, using a repel function
  # (fixed seed makes the sample and label placement reproducible)
  geom_text_repel(data = dplyr::sample_frac(sfe_ca, 0.2), max.overlaps = 15, seed = 6734) +
  theme_bw() +
  theme(plot.title = element_text(face='bold')) +
  labs(title = 'Correspondence Analysis: Full Data')

## plot parts of the data
ggplot(sfe_ca, aes(x=dim1, y=dim2, label=id)) +
  geom_point(aes(color=dim1-dim2), alpha = 0.2) +
  # plot 0.2 of all labels, using a repel function
  geom_text_repel(data = dplyr::sample_frac(sfe_ca, 0.2), max.overlaps = 9, seed = 6734) +
  # NOTE(review): scale_*_continuous(limits=) DROPS points outside the range
  # (with a warning); coord_cartesian() would zoom without discarding data
  scale_y_continuous(limits=c(-2,0)) +
  scale_x_continuous(limits=c(-1,1)) +
  theme_bw() +
  theme(plot.title = element_text(face='bold')) +
  labs(title = 'Correspondence Analysis: Zoom')

Unsupervised LDA

## run naive unsupervised topic model with 10 topics
## (LDA is stochastic -- the seed makes the topic assignment reproducible)
set.seed(123)
sfe_lda <- textmodel_lda(dfm_sfe, k = 10)
## print top 20 terms per topic
terms(sfe_lda, 20)
##       topic1          topic2         topic3           topic4           topic5        topic6        topic7             topic8         topic9        topic10     
##  [1,] "supervenience" "gene"         "disability"     "turing"         "spacetime"   "ockham"      "chinese"          "heidegger"    "privacy"     "ibn"       
##  [2,] "trope"         "molecular"    "oppression"     "gödel"          "einstein"    "bacon"       "spinoza"          "dewey"        "theism"      "avicenna"  
##  [3,] "monism"        "neural"       "african"        "algebra"        "kuhn"        "scotus"      "reid"             "husserl"      "torture"     "buddhist"  
##  [4,] "fictional"     "simulation"   "racial"         "intuitionistic" "popper"      "pythagoras"  "dao"              "du"           "user"        "maimonides"
##  [5,] "bolzano"       "dna"          "coercion"       "computation"    "reichenbach" "boethius"    "nietzsche"        "malebranche"  "clinical"    "arabic"    
##  [6,] "brentano"      "darwin"       "feminism"       "ordinal"        "weyl"        "pythagorean" "confucian"        "artist"       "whitehead"   "averroes"  
##  [7,] "goodman"       "fitness"      "capability"     "hilbert"        "gravity"     "parmenides"  "sidgwick"         "berlin"       "delusion"    "japanese"  
##  [8,] "physicalism"   "inheritance"  "domination"     "cardinal"       "hole"        "proclus"     "utilitarianism"   "artistic"     "theist"      "buddha"    
##  [9,] "strawson"      "fodor"        "pornography"    "tarski"         "ramsey"      "cicero"      "mohists"          "spinoza"      "theistic"    "buddhism"  
## [10,] "noun"          "ai"           "distributive"   "peirce"         "bayesian"    "philo"       "conscience"       "wolff"        "hartshorne"  "dharma"    
## [11,] "bradley"       "artifact"     "marx"           "algorithm"      "entropy"     "plotinus"    "consequentialist" "nietzsche"    "enhancement" "islamic"   
## [12,] "austin"        "imagery"      "egalitarian"    "algebraic"      "bohr"        "sextus"      "luck"             "herder"       "doxastic"    "indian"    
## [13,] "armstrong"     "biologist"    "dworkin"        "recursive"      "newtonian"   "porphyry"    "laozi"            "clarke"       "omnipotent"  "emptiness" 
## [14,] "chisholm"      "drift"        "coercive"       "brouwer"        "payoff"      "abelard"     "zhuangzi"         "sartre"       "embryo"      "mystical"  
## [15,] "entailment"    "biodiversity" "sovereign"      "provable"       "feyerabend"  "bce"         "zhu"              "romantic"     "suicide"     "nishida"   
## [16,] "plural"        "cancer"       "income"         "zfc"            "dynamical"   "iamblichus"  "thick"            "bois"         "internalism" "japan"     
## [17,] "intension"     "quale"        "constitutional" "computable"     "bell"        "fr"          "hutcheson"        "artwork"      "gratitude"   "zen"       
## [18,] "implicature"   "adaptation"   "liberalism"     "cantor"         "gas"         "luther"      "confucius"        "fichte"       "engine"      "hebrew"    
## [19,] "grice"         "genome"       "anderson"       "diagram"        "mach"        "timaeus"     "relativism"       "schopenhauer" "csm"         "vasubandhu"
## [20,] "meinong"       "digital"      "republican"     "definable"      "bet"         "sophist"     "wang"             "collins"      "goldman"     "al-fārābī"
## plot the topics over the correspondence analysis data
## topics() returns the most likely topic per document
sfe_ca$topics <- topics(sfe_lda)
ggplot(sfe_ca, aes(x=dim1, y=dim2, color=topics)) +
  geom_point(alpha = 0.5, shape = '.') +
  # 2d density contours show where each topic concentrates in CA space
  geom_density_2d(alpha = 0.5) +
  theme_bw() +
  theme(plot.title = element_text(face='bold')) +
  labs(title = 'Correspondence Analysis with Topic Annotation (k=10)')

> Exercise

Task

Change the names of the topics (to some meaningful description) before plotting.

Solution
## replace the machine-generated topic names with interpretable labels
## (labels derived from the top terms printed above)
sfe_ca$topics <- recode(sfe_ca$topics, topic1 = "body-mind", topic2 = "biology", 
                        topic3 = "feminism/critical thinking", topic4 = "math/ai", 
                        topic5 = "physics", topic6 = "classics", topic7 = "eastern",
                        topic8 = "phenomenology", topic9 = "religion", 
                        topic10 = "middle-eastern/eastern")

PoS-tagging - leaving the sandbox

## set seed for a reproducible sample
set.seed(48621)
## draw a random sample of 5 documents
## (the original comment said 20, but the code samples 5 -- see the
## 5-document corpus printed below; seq_along() replaces the unsafe 1:length())
sfe_sub <- sfe[sample(seq_along(sfe), 5)]
sfe_sub
## Corpus consisting of 5 documents and 21 docvars.
## albert-saxony.json :
## " Albert of Saxony (ca. 1320–1390), Master of Arts at Paris, ..."
## 
## contractarianism.json.1 :
## " Contractarianism names both a political theory of the legit..."
## 
## preferences.json.1 :
## " The notion of preference has a central role in many discipl..."
## 
## plotinus.json :
## " Plotinus (204/5 – 270 C.E.), is generally regarded as the f..."
## 
## paternalism.json :
## " Paternalism is the interference of a state or an individual..."
## PoS-tagging via spaCy (requires a working spacyr/spaCy installation;
## use TRUE rather than the reassignable shorthand T)
sfe_pos <- spacy_parse(sfe_sub, pos = TRUE, tag = TRUE, lemma = TRUE, entity = TRUE, dependency = TRUE)
sfe_pos

Augment your sandbox

## aggregate tokens and pos-tags back to documents as 'token__POS'
## paste0() is vectorized, so the original rowwise() was unnecessary and
## slow on large parse tables -- a plain mutate() yields identical values
sfe_pos <- sfe_pos %>% mutate(token_pos = paste0(token,'__', pos)) 
sfe_pos
## collapse each document's tagged tokens into a single text string
sfe_pos <- sfe_pos %>% 
  group_by(doc_id) %>% 
  summarise(text = paste0(token_pos, collapse = ' '))
sfe_pos
## import it to quanteda and add metadata
sfe_pos <- corpus(sfe_pos)
## FIX: summarise() returns documents sorted by doc_id, which need not match
## the order of sfe_sub (compare the printed corpora above/below) -- align
## the docvars by document name before assigning, or metadata gets shuffled
docvars(sfe_pos) <- docvars(sfe_sub)[match(docnames(sfe_pos), docnames(sfe_sub)), ]
sfe_pos
## Corpus consisting of 5 documents and 21 docvars.
## albert-saxony.json :
## " __SPACE Albert__PROPN of__ADP Saxony__PROPN (__PUNCT ca__PR..."
## 
## contractarianism.json.1 :
## " __SPACE Contractarianism__PROPN names__NOUN both__CCONJ a__..."
## 
## paternalism.json :
## " __SPACE Paternalism__PROPN is__AUX the__DET interference__N..."
## 
## plotinus.json :
## " __SPACE Plotinus__PROPN (__PUNCT 204/5__NUM –__PUNCT 270__N..."
## 
## preferences.json.1 :
## " __SPACE The__DET notion__NOUN of__ADP preference__NOUN has_..."
## get all the nouns preceded by the adjective 'rational'
## FIX: the original class [A-z] also matches the ASCII characters between
## 'Z' and 'a' ("[", "\", "]", "^", "_", "`"); [A-Za-z] matches letters only
rational_noun <- stri_match_all(sfe_pos, regex = '(?<=rational__ADJ\\s)[A-Za-z]+__NOUN')
## keep track of which document each match list belongs to
names(rational_noun) <- docnames(sfe_pos)
rational_noun
## $`albert-saxony.json`
##      [,1]
## [1,] NA  
## 
## $contractarianism.json.1
##       [,1]               
##  [1,] "assessment__NOUN" 
##  [2,] "others__NOUN"     
##  [3,] "interaction__NOUN"
##  [4,] "person__NOUN"     
##  [5,] "compliance__NOUN" 
##  [6,] "players__NOUN"    
##  [7,] "actors__NOUN"     
##  [8,] "interaction__NOUN"
##  [9,] "interaction__NOUN"
## [10,] "interaction__NOUN"
## 
## $paternalism.json
##       [,1]              
##  [1,] "ends__NOUN"      
##  [2,] "agency__NOUN"    
##  [3,] "ground__NOUN"    
##  [4,] "tendency__NOUN"  
##  [5,] "tendencies__NOUN"
##  [6,] "tendencies__NOUN"
##  [7,] "persuasion__NOUN"
##  [8,] "tendencies__NOUN"
##  [9,] "capacities__NOUN"
## [10,] "capacities__NOUN"
## [11,] "propensity__NOUN"
## [12,] "capacities__NOUN"
## [13,] "creatures__NOUN" 
## 
## $plotinus.json
##      [,1]                
## [1,] "desire__NOUN"      
## [2,] "life__NOUN"        
## [3,] "agent__NOUN"       
## [4,] "universalism__NOUN"
## 
## $preferences.json.1
##      [,1]               
## [1,] "choice__NOUN"     
## [2,] "individuals__NOUN"
## [3,] "ground__NOUN"     
## [4,] "criticism__NOUN"  
## [5,] "criticism__NOUN"
## flatten the per-document match matrices into one long dataframe,
## repeating each doc_id once per match (lengths() gives matches per doc)
rational_noun <- data.frame(match = do.call(rbind, rational_noun),
       doc_id = rep(names(rational_noun), lengths(rational_noun)))
rational_noun
## count them (na.omit drops the NA rows from documents without any match)
rational_noun %>% 
  na.omit %>% 
  group_by(doc_id, match) %>% 
  summarise(n = n()) %>% 
  arrange(doc_id, desc(n)) %>% 
  print(n=200)
## # A tibble: 24 x 3
## # Groups:   doc_id [4]
##    doc_id                  match                  n
##    <chr>                   <chr>              <int>
##  1 contractarianism.json.1 interaction__NOUN      4
##  2 contractarianism.json.1 actors__NOUN           1
##  3 contractarianism.json.1 assessment__NOUN       1
##  4 contractarianism.json.1 compliance__NOUN       1
##  5 contractarianism.json.1 others__NOUN           1
##  6 contractarianism.json.1 person__NOUN           1
##  7 contractarianism.json.1 players__NOUN          1
##  8 paternalism.json        capacities__NOUN       3
##  9 paternalism.json        tendencies__NOUN       3
## 10 paternalism.json        agency__NOUN           1
## 11 paternalism.json        creatures__NOUN        1
## 12 paternalism.json        ends__NOUN             1
## 13 paternalism.json        ground__NOUN           1
## 14 paternalism.json        persuasion__NOUN       1
## 15 paternalism.json        propensity__NOUN       1
## 16 paternalism.json        tendency__NOUN         1
## 17 plotinus.json           agent__NOUN            1
## 18 plotinus.json           desire__NOUN           1
## 19 plotinus.json           life__NOUN             1
## 20 plotinus.json           universalism__NOUN     1
## 21 preferences.json.1      criticism__NOUN        2
## 22 preferences.json.1      choice__NOUN           1
## 23 preferences.json.1      ground__NOUN           1
## 24 preferences.json.1      individuals__NOUN      1

Additional material

Hierarchical clustering

## hierarchical clustering - get distances on normalized dfm
## (term counts are weighted to proportions so document length does not
## dominate the euclidean distances)
sfe_dist_mat <- dfm_weight(dfm_sfe, scheme = "prop") %>%
    textstat_dist(method = "euclidean") %>% 
    as.dist()
## hierarchical clustering on the distance object
## NOTE(review): 'ward.D' implicitly assumes squared distances; 'ward.D2'
## is the variant for unsquared euclidean input -- confirm which is intended
sfe_cluster <- hclust(sfe_dist_mat, method = 'ward.D')
# label with document names (strip '.json' and duplicate-name suffixes)
sfe_cluster$labels <- gsub('\\.json(\\.[0-9])?', '', docnames(dfm_sfe))
## determine best numbers of clusters (elbow method; computationally
## expensive, hence commented out)
# fviz_nbclust(as.matrix(sfe_dist_mat), FUN = hcut, method = "wss")
## cut tree into four groups
clusters <- cutree(sfe_cluster, k = 4)
## add cluster-data to the correspondence analysis (left_join matches on
## the shared 'id' column)
sfe_ca_hcl <- left_join(sfe_ca, data.frame(cluster = clusters, id = names(clusters)))
## plot the clusters in CA space, one facet per cluster
ggplot(sfe_ca_hcl, aes(x=dim1, y=dim2, label=id)) +
  geom_point(aes(color=as.factor(cluster)), alpha = 0.2) +
  facet_grid(~as.factor(cluster))

## hierarchical clustering doesn't provide discrete clusters along
## the dimensions of the correspondence analysis

Cosine similarities for documents

## subset documents about logic (docnames with 'logic' as a hyphen-delimited
## component; TRUE instead of the reassignable shorthand T)
logic <- dfm_subset(dfm_sfe, grepl('(?<=\\-)logic|logic(?=\\-)', docnames(dfm_sfe), perl = TRUE))
## compute cosine similarity between all document pairs
logic_sim <- textstat_simil(logic, margin = 'document', method = 'cosine')
## all pairs with a cosine similarity > .4
as.data.frame(logic_sim) %>% 
  filter(cosine > .4) %>% 
  arrange(desc(cosine))

> Exercise

Task

Redo the cosine similarities for another subset of documents.

Solution
## subset documents about aesthetics
## (TRUE instead of the reassignable shorthand T)
aesth <- dfm_subset(dfm_sfe, grepl('aesthetics', docnames(dfm_sfe), perl = TRUE))
## compute cosine similarity (note: this overwrites the dfm subset `aesth`)
aesth <- textstat_simil(aesth, margin = 'document', method = 'cosine')
## all pairs with a cosine similarity > .2
as.data.frame(aesth) %>% 
  filter(cosine > .2) %>% 
  arrange(desc(cosine))

Cosine similarities for features

## subset documents about feminism (docnames with 'fem' as a hyphen-delimited
## component; TRUE instead of the reassignable shorthand T)
fem <- dfm_subset(dfm_sfe, grepl('(?<=\\-)fem|fem.*(?=\\-)', docnames(dfm_sfe), perl = TRUE))
## compute cosine similarities for the features 
## "empowerment", "embodiment", and "rape"
## FIX: the original called textstat_simil() on the `logic` dfm, ignoring
## the feminism subset computed just above -- use `fem` instead
fem_sim <- textstat_simil(fem, fem[, c("empowerment", "embodiment", "rape")], 
                          margin = 'feature', method = 'cosine')
## top 5 results per feature
as.data.frame(fem_sim) %>% 
  group_by(feature2) %>% 
  arrange(feature2, desc(cosine)) %>% 
  slice_head(n=5)

> Exercise

Task

Redo the cosine similarities for a different set of features.

Solution
## FIX: the original reused the `logic` dfm here; the feminism subset `fem`
## (built in the section above) is the intended input
fem_sim <- textstat_simil(fem, fem[, c("feminism", "patriarchy")], 
                          margin = 'feature', method = 'cosine')
## top 5 results per feature
as.data.frame(fem_sim) %>% 
  group_by(feature2) %>% 
  arrange(feature2, desc(cosine)) %>% 
  slice_head(n=5)
 




A work by Lucien Baumgartner

https://lucienbaumgartner.github.io/